import pandas as pd
# Amazon review data; drop the unnamed index column that to_csv saved.
amazon = pd.read_csv('amazon_comment_seaweed_chip_result.csv')
amazon = amazon.drop(columns=amazon.columns[0])
amazon.head()
# Trim comment_date: keep only the text after "on "
# (e.g. "Reviewed in the United States on July 1, 2019").
def name_cut(x):
    # NOTE(review): raises IndexError if "on " is absent — confirm all rows match.
    return x.split('on ')[1]

amazon["comment_date"] = amazon["comment_date"].apply(name_cut)
amazon.head()
# Trim star rating: "4.0 out of 5 stars" -> 4.0 (float).
# (Redefines name_cut; this version is reused later for the bugak data.)
def name_cut(x):
    return float(x.split(" out")[0])

amazon["star_rating"] = amazon["star_rating"].apply(name_cut)
amazon.head()
amazon.to_csv('amazon.csv')
# Walmart review data.
walmart = pd.read_csv('walmart.csv')
walmart.head()

# YouTube transcript data (two files); each carries a saved index column to drop.
youtube_1 = pd.read_excel('youtube_script1.xlsx')
youtube_1 = youtube_1.drop(columns=youtube_1.columns[0])
youtube_1.head()
youtube_2 = pd.read_excel('youtube_script2.xlsx')
youtube_2 = youtube_2.drop(columns=youtube_2.columns[0])
youtube_2.head()

# All YouTube text stacked into one 'text' column.
youtube = pd.concat([youtube_1['text'], youtube_2['text']], axis=0)
youtube = pd.DataFrame({'text': youtube})
youtube.head()
# Full corpus: every comment/transcript stacked into a single 'comment' column.
data = pd.concat(
    [amazon['comment'], walmart['comment'], youtube_1['text'], youtube_2['text']],
    axis=0,
)
data = pd.DataFrame({'comment': data})
data.head()
data.to_csv('full_data.csv')
len(data)
# Amazon + Walmart aligned into one frame (the two sources use different column names).
date = pd.concat([amazon['comment_date'], walmart['date']], axis=0)
title = pd.concat([amazon['comment_title'], walmart['title']], axis=0)
review = pd.concat([amazon['comment'], walmart['comment']], axis=0)
score = pd.concat([amazon['star_rating'], walmart['score']], axis=0)
author = pd.concat([amazon['comment_author'], walmart['author']], axis=0)
review = pd.DataFrame({
    'date': date,
    'title': title,
    'review': review,
    'score': score,
    'author': author,
})
review.head(15)
review.to_csv('amazon_walmart_full_data.csv')
len(review)
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
import numpy as np
# Text cleaning: English stopwords plus generic sentiment/filler words that
# would otherwise dominate the frequency analysis.
stop_words = set(stopwords.words("english"))
stop_words.update((
    "love", "loves", "great", "good", "better", "amazing", "buying", "found",
    "get", "like", "loved", 'maybe', 'may', 'could', 'awesome', 'definitely',
    'perfect', 'best', 'okay', 'excellent', 'disappointed', 'right', 'nice',
    'however', 'maybe', 'pretty', 'thank', 'wonderful', 'terrible',
    'unfortunately', 'awful', 'horrible', 'worst', 'wont', 'surprised', 'one',
    'bad', 'actually', 'really', 'would', 'everybody', 'sometime', 'state',
    'generally', 'edge', 'report', 'gross',
))
def text_cleaner(text, num):
    """Normalize one document to lowercase a-z words.

    Strips HTML, parenthesized spans and double quotes, expands contractions,
    removes possessive 's, collapses runs of 'm', and drops 1-letter tokens.
    num == 0 additionally removes stopwords; any other value keeps them.
    """
    cleaned = str(text).lower()
    cleaned = BeautifulSoup(cleaned, "lxml").text
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    cleaned = re.sub('"', '', cleaned)
    cleaned = ' '.join(contraction_mapping.get(t, t) for t in cleaned.split(" "))
    cleaned = re.sub(r"'s\b", "", cleaned)
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)
    cleaned = re.sub('[m]{2,}', 'mm', cleaned)
    if num == 0:
        tokens = [w for w in cleaned.split() if w not in stop_words]
    else:
        tokens = cleaned.split()
    # Drop very short (single-letter) words.
    long_words = [w for w in tokens if len(w) > 1]
    return (" ".join(long_words)).strip()
# English contraction -> expansion table used by text_cleaner.
# FIX: the original literal listed "won't" twice (with the same value); the
# duplicate key has been removed — the resulting dict is unchanged.
contraction_mapping = {"won't": "will not", "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
"didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
"he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
"I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
"i'd've": "i would have", "i'll": "i will", "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
"it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
"mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
"mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
"oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
"she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
"should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
"this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
"there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
"they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
"wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
"what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
"where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
"why's": "why is", "why've": "why have", "will've": "will have", "won't've": "will not have",
"would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
"you're": "you are", "you've": "you have"}
# Clean every Amazon/Walmart review (num=0: stopwords removed).
review['cleaned_text'] = [text_cleaner(doc, 0) for doc in review['review']]
review['cleaned_text'].head()

# Clean every YouTube transcript line the same way.
youtube['cleaned_text'] = [text_cleaner(doc, 0) for doc in youtube['text']]
youtube['cleaned_text'].head()
# Visualize the number of words per cleaned review.
import matplotlib.pyplot as plt

length = pd.DataFrame({'text': [len(doc.split()) for doc in review['cleaned_text']]})
length.hist(bins=80)
plt.rc('figure', figsize=(10, 5))
plt.show()

# Score distribution of the combined Amazon + Walmart reviews.
import seaborn as sns

sns.countplot(data=review, x='score')
plt.rc('figure', figsize=(10, 5))
plt.show()
review.groupby("score").count()
from nltk.probability import FreqDist

# NOTE(review): FreqDist is fed whole cleaned documents here, so it counts
# duplicate documents rather than individual words — confirm that is intended.
for series in (review['cleaned_text'], youtube['cleaned_text']):
    fdist = FreqDist(series)
    print(fdist)
    fdist.most_common(2)
    fdist.plot(30, cumulative=False)
    plt.show()
import nltk

# Lexical dispersion of a few topics of interest across the review corpus.
plt.figure(figsize=(14, 8))
myText = nltk.Text(review['cleaned_text'])
topics = ['snack', 'salty', 'kids', 'healthy snack', 'expensive']
myText.dispersion_plot(topics)

# FIX: the FreqDist does not depend on `topic`, yet it was rebuilt from the
# whole corpus on every loop iteration; build it once and reuse it.
freqdist = nltk.FreqDist(review['cleaned_text'])
for topic in topics:
    print(topic, 'more :', ' , '.join([word.lower() for word, count in freqdist.most_common(5)]))
    freqdist.plot(10)
# Tokenize and POS-tag every cleaned review.
tokenized_doc = review['cleaned_text'].apply(lambda x: x.split())
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

tagged_list = [pos_tag(tokens) for tokens in tokenized_doc]
# NOTE(review): this rebinding shadows the pos_tag function imported above;
# it is re-imported later in the file before its next use.
pos_tag = pd.DataFrame({'tagged_list': tagged_list})
pos_tag.head()
def count_tags(title_with_tags):
    """Return a mapping {POS tag: occurrence count} for one tagged token list.

    `title_with_tags` is an iterable of (word, tag) pairs; the word is ignored.
    Returns a collections.Counter (a dict subclass), so existing callers that
    use `in`, iteration, or .get() keep working.
    """
    from collections import Counter
    return Counter(tag for _word, tag in title_with_tags)
# Per-review tag-count dicts, then one integer column per observed POS tag.
pos_tag['tagged_list'].map(count_tags).head()
pos_tag['tag_counts'] = pos_tag['tagged_list'].map(count_tags)
pos_tag.head()
tag_set = list({tag for counts in pos_tag['tag_counts'] for tag in counts})
for tag in tag_set:
    pos_tag[tag] = pos_tag['tag_counts'].map(lambda counts: counts.get(tag, 0))
title = 'Frequency of POS Tags in Review'
pos_tag[tag_set].sum().sort_values().plot(kind='barh', logx=True, figsize=(12, 10), title=title)
# Build word -> {tag: count} across the whole corpus.
vocabulary = {}
for tagged in pos_tag['tagged_list']:
    for word, tag in tagged:
        tag_counts = vocabulary.setdefault(word, {})
        tag_counts[tag] = tag_counts.get(tag, 0) + 1

vocabulary_df = pd.DataFrame.from_dict(vocabulary, orient='index')
vocabulary_df.fillna(value=0, inplace=True)

tag = 'NNP'  # NNP: Proper noun, singular
vocabulary_df.sort_values(by=tag, ascending=False).head(10)
def _plot_top_words_for_tag(tag, size=25):
    """Horizontal bar chart of the `size` most frequent words carrying `tag`."""
    title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
    vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12, 10), title=title)

# FIX: the original repeated the same three lines eight times with only the
# tag changed; loop over the tags of interest instead.
# VBG: gerund/present participle, VBD: past tense, NN/NNS: nouns,
# RB/RBR: adverbs, JJ/JJR: adjectives.
for tag in ['VBG', 'VBD', 'NN', 'NNS', 'RB', 'RBR', 'JJ', 'JJR']:
    _plot_top_words_for_tag(tag)
# word2vec model
import multiprocessing
from gensim.models import word2vec

num_features = 300                         # word-vector dimensionality
min_word_count = 10                        # ignore words rarer than this
num_workers = multiprocessing.cpu_count()  # parallel worker threads
context_size = 5                           # context window size
seed = 1                                   # RNG seed for reproducibility

# NOTE(review): `size=` / `.wv.vocab` are gensim<4 APIs (renamed in 4.x).
word2vec_model = word2vec.Word2Vec(
    tokenized_doc,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
)
print("word2vec vocabulary length: ", len(word2vec_model.wv.vocab))
# word2vec matrix: copy up to `count` word vectors into a dense float64 matrix.
# (FIX: dropped the mid-file `from __future__ import ...` — a no-op on
# Python 3 and a SyntaxError anywhere but the top of a module.)
import numpy as np

count = 1000
word_vectors_matrix = np.ndarray(shape=(count, 300), dtype='float64')
word_list = []
i = 0
for word in word2vec_model.wv.vocab:
    word_vectors_matrix[i] = word2vec_model.wv[word]
    word_list.append(word)
    i = i + 1
    if i == count:
        break
# FIX: np.ndarray() leaves memory uninitialized; if the vocabulary holds fewer
# than `count` words, the tail rows contained arbitrary values that were later
# fed to t-SNE. Keep only the rows actually filled.
word_vectors_matrix = word_vectors_matrix[:i]
print("word_vectors_matrix shape is ", word_vectors_matrix.shape)
# Reduce the word vectors to 2D with t-SNE.
import sklearn.manifold

tsne = sklearn.manifold.TSNE(n_components=2, init='pca', n_iter=3500, random_state=0)
word_vectors_matrix_2d = tsne.fit_transform(word_vectors_matrix)
print("word_vectors_matrix_2d shape is ", word_vectors_matrix_2d.shape)

# FIX: the original rebuilt each row via word_list.index(word) — an O(n^2)
# linear scan that, because the words are unique, reproduced exactly the
# enumerate order. Zip the parallel sequences directly.
points = pd.DataFrame(
    [(word, coords[0], coords[1])
     for word, coords in zip(word_list, word_vectors_matrix_2d)],
    columns=['word', 'x', 'y'])
points.head(10)

sns.set_context('poster')
sns.scatterplot("x", "y", s=10, data=points)
def plot_region(x_bounds, y_bounds):
    """Scatter-plot the words whose 2D t-SNE coordinates fall inside the box.

    x_bounds / y_bounds are (low, high) tuples; each point is labeled with
    its word, slightly offset from the marker.
    """
    # FIX: the local was named `slice`, shadowing the builtin.
    region = points[
        (x_bounds[0] <= points['x']) &
        (points['x'] <= x_bounds[1]) &
        (y_bounds[0] <= points['y']) &
        (points['y'] <= y_bounds[1])
    ]
    ax = region.plot.scatter('x', 'y', s=35, figsize=(30, 28))
    for _, point in region.iterrows():
        ax.text(point['x'] + 1, point['y'] + 1, point['word'], fontsize=11)

plot_region(x_bounds=(-100, 75), y_bounds=(-20, 20))
plot_region(x_bounds=(-100, -25), y_bounds=(-20, 20))
from sklearn.manifold import TSNE

def display_closestwords_tsnescatterplot(model, word):
    """t-SNE scatter of `word` together with its 10 nearest word2vec neighbours."""
    close_words = model.wv.similar_by_word(word)
    word_labels = [word] + [neighbour for neighbour, _score in close_words]
    # Stack the query vector and neighbour vectors (float32, shape (11, 300)).
    arr = np.array([model.wv[w] for w in word_labels], dtype='f')
    # Project to 2D.
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # Scatter plot with one text label per point.
    plt.scatter(x_coords, y_coords)
    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    # NOTE(review): both limits are shifted by +0.005; the lower bound was
    # presumably meant to be min - 0.005 — confirm before relying on the plot.
    plt.xlim(x_coords.min()+0.005, x_coords.max()+0.005)
    plt.ylim(y_coords.min()+0.005, y_coords.max()+0.005)
    plt.rc('figure', figsize=(10, 8))
    plt.show()

display_closestwords_tsnescatterplot(word2vec_model, 'snack')
word2vec_model.wv.most_similar("snack")
display_closestwords_tsnescatterplot(word2vec_model, 'healthy')
word2vec_model.wv.most_similar("healthy")
display_closestwords_tsnescatterplot(word2vec_model, 'calorie')
word2vec_model.wv.most_similar("calorie")
word2vec_model.wv.most_similar("family")
display_closestwords_tsnescatterplot(word2vec_model, 'seasoning')
word2vec_model.wv.most_similar("seasoning")
word2vec_model.wv.most_similar("sauce")
word2vec_model.wv.most_similar("bland")
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import seaborn as sns

# Binary sentiment from the star score, excluding neutral 3-star reviews.
# FIX: take an explicit copy — assigning a new column on a boolean-filter view
# triggers SettingWithCopyWarning and can silently fail to write.
df = review[review['score'] != 3].copy()
df['sentiment'] = df['score'].apply(lambda score: 'positive' if score > 3 else 'negative')
df.head()
display(df['score'].value_counts())
display(df['sentiment'].value_counts())

# Balanced sample: 500 positive + 500 negative reviews.
positive_points = df[df['sentiment'] == 'positive'].sample(n=500)
negative_points = df[df['sentiment'] == 'negative'].sample(n=500)
# Concatenating both of above
total_points = pd.concat([positive_points, negative_points])
# FIX: CountVectorizer is first *used* here, but its import only appeared much
# further down the script, so this cell raised NameError when run in order —
# import it at the point of use. Also, the vectorizer was initialized twice
# with identical arguments (the second copy was commented "bigram" yet still
# used ngram_range=(1, 1)); a single initialization suffices.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(ngram_range=(1, 1))
# Initializing standard scaler (sparse input, so no mean-centering).
std_scaler = StandardScaler(with_mean=False)
# Creating count vectors, scaling, and converting into dense representation.
sample_points = total_points['cleaned_text']
sample_points = count_vect.fit_transform(sample_points)
sample_points = std_scaler.fit_transform(sample_points)
sample_points = sample_points.todense()
# Storing class label in variable
labels = total_points['sentiment']
# Getting shape
print(sample_points.shape, labels.shape)
# t-SNE projection of the count vectors, coloured by sentiment label.
tsne_data = sample_points
tsne_labels = labels
model = TSNE(n_components=2, random_state=15, perplexity=20, n_iter=2000)
tsne_data = model.fit_transform(tsne_data)
# Attach the labels as a third column, then wrap in a frame for plotting.
tsne_data = np.vstack((tsne_data.T, tsne_labels)).T
tsne_df = pd.DataFrame(data=tsne_data, columns=('Dim_1', 'Dim_2', 'label'))
# NOTE(review): `size=` is the pre-0.9 seaborn FacetGrid name (now `height=`).
sns.FacetGrid(tsne_df, hue='label', size=5).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
plt.title("TSNE with default parameters")
plt.xlabel("Dim_1")
plt.ylabel("Dim_2")
plt.rc('figure', figsize=(12, 15))
plt.show()
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

# Label every review (here scores > 3 count as positive).
review['sentiment'] = review['score'].apply(lambda score: 'positive' if score > 3 else 'negative')
review.head()
positive = review[review['sentiment'] == 'positive']
positive.head()
# FIX: `negative` was filtered on == 'positive', so both CSVs contained the
# same positive rows; filter on 'negative'.
negative = review[review['sentiment'] == 'negative']
negative.head()
# NOTE(review): 'postive.csv' filename typo kept — downstream may expect it.
positive.to_csv('postive.csv')
negative.to_csv('negative.csv')
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Word-cloud stopwords: the library defaults plus generic purchase/sentiment
# words that would otherwise dominate every cloud.
# NOTE(review): this rebinds the name `stopwords`, shadowing the nltk module
# alias; nltk's stopwords is re-imported later, before its next use.
stopwords = set(STOPWORDS)
stopwords.update([
    'like', 'product', 'bought', 'okay', 'buy', 'ok',
    'love', 'enjoy', 'buying', 'better', 'great', 'good',
])
def show_wordcloud(data, title=None):
    """Render a word cloud from an iterable of review texts.

    `data` is any iterable of documents (e.g. a pandas Series of strings);
    `title` is an optional figure title.
    """
    # FIX: the original called str(data) on the whole Series, which yields the
    # *truncated repr* — index numbers, an ellipsis, and the dtype footer — so
    # the cloud was built from a fraction of the text plus repr noise. Join
    # the actual documents instead.
    text = " ".join(str(doc) for doc in data)
    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        collocations=False,
        random_state=1  # chosen at random by flipping a coin; it was heads
    ).generate(text)
    fig = plt.figure(1, figsize=(8, 8))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)
    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(review[review.score == 5]["review"], title="High scored words")
show_wordcloud(review[review.score == 1]["review"], title="Low scored words")
from gensim import corpora
from gensim import models
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Bag-of-words corpus (term id -> count per document) and a 2-topic LDA model.
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                               num_topics=2, random_state=1)
for t in lda.show_topics():
    print(t)

# One word cloud per topic, built from the topic's top-200 word weights.
wc = WordCloud(background_color='black')
plt.figure(figsize=(30, 30))
for t in range(lda.num_topics):
    plt.subplot(5, 4, t + 1)
    topic_freqs = dict(lda.show_topic(t, 200))
    plt.imshow(wc.generate_from_frequencies(topic_freqs))
    plt.axis("off")
    plt.title("Topic #" + str(t))
plt.show()

# Interactive topic-model visualization.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis)
# Drop empty rows: a lone-space cell becomes NaN, then NaN rows are removed.
review.replace(' ', np.nan, inplace=True)
review.dropna(axis=0, inplace=True)
# df = review[review['score'] != 3]
df = review
X = df['cleaned_text']
# Binarize scores: 1-2 stars -> 0 (negative), 3-5 stars -> 1 (positive).
y = df['score'].map({1.0: 0, 2.0: 0, 3.0: 1, 4.0: 1, 5.0: 1})
import seaborn as sns
sns.countplot(y)
plt.rc('figure', figsize=(5, 2))
plt.show()
# Logistic regression on bag-of-words / tf-idf features.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
import warnings
warnings.filterwarnings("ignore")

c = CountVectorizer(stop_words='english')

def text_fit(X, y, model, clf_model, coef_show=1):
    """Vectorize X with `model`, fit `clf_model`, and report test accuracy.

    With coef_show == 1 also prints the 20 most positive and 20 most negative
    word coefficients (requires the classifier to expose coef_).
    """
    X_c = model.fit_transform(X)
    print('# features: {}'.format(X_c.shape[1]))
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=0)
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    clf = clf_model.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    print ('Model Accuracy: {}'.format(acc))
    if coef_show == 1:
        coeff_df = pd.DataFrame({
            'Word': model.get_feature_names(),
            'Coefficient': clf.coef_.tolist()[0],
        })
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')
        print(coeff_df.tail(20).to_string(index=False))

text_fit(X, y, c, LogisticRegression())
text_fit(X, y, c, DummyClassifier(), 0)  # coef_show=0: DummyClassifier has no coef_
tfidf_n = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words)
text_fit(X, y, tfidf_n, LogisticRegression())
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

# NOTE(review): here 3-star reviews count as positive (>= 3), unlike the
# earlier cells which used score > 3 — confirm which rule is intended.
review['sentiment'] = review['score'].apply(lambda score: 'positive' if score >= 3 else 'negative')
train, test = train_test_split(review, test_size=0.2)

# tf-idf over 1- to 4-grams, fit on the training split only.
countVector = CountVectorizer(min_df=1, ngram_range=(1, 4))
X_train_counts = countVector.fit_transform(train["cleaned_text"])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(countVector.transform(test["cleaned_text"]))
y_train = train["sentiment"]
y_test = test["sentiment"]

# Fit three classifiers and collect their test-set predictions.
prediction = dict()
prediction['Multinomial'] = MultinomialNB().fit(X_train_tfidf, y_train).predict(X_test_tfidf)
prediction['Bernoulli'] = BernoulliNB().fit(X_train_tfidf, y_train).predict(X_test_tfidf)
logreg = LogisticRegression(C=1e5)
logreg_result = logreg.fit(X_train_tfidf, y_train)
prediction['Logistic'] = logreg.predict(X_test_tfidf)
def formatt(x):
    """Map a sentiment label to an int: 'negative' -> 0, anything else -> 1."""
    return 0 if x == 'negative' else 1

# Elementwise version for arrays of labels.
vfunc = np.vectorize(formatt)
# One ROC curve per classifier, on a shared axis.
colors = ['b', 'g', 'y', 'm', 'k']
for cmp, (model, predicted) in enumerate(prediction.items()):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        y_test.map(formatt), vfunc(predicted))
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.plot(false_positive_rate, true_positive_rate, colors[cmp],
             label='%s: AUC %0.2f' % (model, roc_auc))
plt.title('Classifiers comparaison with ROC')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.rc('figure', figsize=(20, 12))
plt.show()
# FIX: with no `labels=` argument, classification_report orders classes by
# sorted label value — 'negative' before 'positive' — so target_names must be
# listed in that same order; the original ["positive", "negative"] swapped the
# names attached to each class's precision/recall row.
print(metrics.classification_report(y_test, prediction['Logistic'], target_names=["negative", "positive"]))
accuracy_score(y_test, prediction['Logistic'])
# N-gram coefficients of the logistic model; most negative first.
features = countVector.get_feature_names()
feature_coefs = pd.DataFrame(
    list(zip(features, logreg_result.coef_[0])),
    columns=['feature', 'coefficient'])
feature_coefs.sort_values(by='coefficient')[:30]
import networkx as nx
import nltk

# POS-tag every tokenized review sentence.
tagged_sents = [nltk.pos_tag(sentence) for sentence in tokenized_doc]
import re
# Peek at the nouns/adjectives of the first sentence.
for token, tag in tagged_sents[0]:
    if re.match(r'NN*|JJ*', tag):
        print(token, tag)

# Nouns/adjectives per sentence -> pairwise co-occurrence edges -> graph.
noun_phrases = [
    [token for token, tag in sent if re.match(r'NN*|JJ*', tag)]
    for sent in tagged_sents
]
import itertools as it
edgelist = [edge for phrase in noun_phrases for edge in it.combinations(phrase, 2)]
G = nx.Graph(edgelist)

# Rank words by betweenness centrality; show the top 10.
index = nx.betweenness_centrality(G)
sorted_index = sorted(index.items(), key=lambda kv: kv[1], reverse=True)
for word, centr in sorted_index[:10]:
    print(word, centr)
G.size()
# IPython magics: %pylab pulls numpy/matplotlib into the namespace; figures render as PNG.
%pylab inline
%config InlineBackend.figure_format = 'png'
plt.rc('figure', figsize=(30, 25))
# Prune zero-centrality nodes, size the remaining nodes by centrality, and draw the graph.
G.remove_nodes_from([n for n in index if index[n] == .0])
node_size = [index[n]*10000 for n in G]
pos = nx.spring_layout(G)
nx.draw_networkx(G, pos, node_size=node_size, node_color='#A0CBE2', edge_color='white', alpha=.5, linewidths=15)
# Second product (bugak) Amazon reviews: keep only comment + numeric rating.
df = pd.read_csv('amazon_comment_bugak_result.csv')
df = df.loc[:, ['comment', 'star_rating']]
# name_cut is the star-rating cutter defined earlier ("4.0 out of 5 stars" -> 4.0).
df['star_rating'] = df["star_rating"].apply(name_cut)
df.head()
sns.countplot(data=df, x='star_rating')
plt.rc('figure', figsize=(12, 7))
plt.show()
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

lmtzr = WordNetLemmatizer()
# Negation words / clause punctuation (precompiled; unused below but kept for callers).
negation = re.compile(r"(?:^(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)$)|n't", re.I)
clp = re.compile(r"^[.:;!?]$", re.I)

def extract_words_from_comments(df):
    """Tokenize, POS-tag, and lemmatize every comment; add a 'comment_tok' column.

    Each kept token is rendered as "<lemma>_<POS>"; stopwords and words of
    length <= 2 are dropped. Returns the same DataFrame, mutated in place.
    """
    # FIX: stopwords.words("english") used to be re-read (and its *list*
    # linearly scanned) once per token of every comment; build the set once.
    stop_set = set(stopwords.words("english"))
    comments_tok = []
    for index, datapoint in df.iterrows():
        tokenized_words = word_tokenize(datapoint["comment"].lower(), language='english')
        pos_tagged_words = pos_tag(tokenized_words)
        tokenized_words = ["_".join([lmtzr.lemmatize(word), tag])
                           for word, tag in pos_tagged_words
                           if word not in stop_set and len(word) > 2]
        comments_tok.append(tokenized_words)
    df["comment_tok"] = comments_tok
    return df

df = extract_words_from_comments(df)
print(df.head())
print(df.shape)
from gensim import matutils, corpora, models

def vectorize_comments(df):
    """Build a filtered gensim Dictionary and a (docs x terms) sparse matrix."""
    d = corpora.Dictionary(df['comment_tok'])
    # Keep terms appearing in >= 2 docs and <= 80% of docs; compact the ids.
    d.filter_extremes(no_below=2, no_above=0.8)
    d.compactify()
    bows = [d.doc2bow(text) for text in df['comment_tok']]
    matrix = matutils.corpus2csc(bows, num_terms=len(d.token2id)).transpose()
    return d, matrix

dictionary, corpus = vectorize_comments(df)
print(corpus.shape)
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle

def train_classifier(X, y):
    """Fit a random forest to X, y via GridSearchCV (single-point grid, 4-fold CV)."""
    parameters = {
        'n_estimators': [100],
        'min_samples_leaf': [1],
        'min_samples_split': [2],
    }
    clf = GridSearchCV(RFC(verbose=1, n_jobs=4), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf
# Train/evaluate the random forest on the bugak review matrix.
X_train, X_test, y_train, y_test = train_test_split(corpus, df["star_rating"], random_state=0)
classifier = train_classifier(X_train, y_train)
print(classifier.best_score_, "----------------Best Accuracy score on Cross Validation Sets")
print(classifier.score(X_test, y_test))
# FIX: use a context manager (the handle leaked if a write raised) and
# separate the two records with newlines (they were written back-to-back
# on a single unreadable line).
with open("Output.txt", "w") as f:
    f.write("Best Accuracy score on Cross Validation Sets %f\n" % classifier.best_score_)
    f.write("Score on Test Set %f\n" % classifier.score(X_test, y_test))
import gender_guesser.detector as gender

g = gender.Detector()
print(g.get_gender(u"Bob"))
review['author']
# FIX: the result list was named `gender`, clobbering the module alias
# imported above; use a distinct name for the per-author labels.
author_genders = [g.get_gender(a) for a in review['author']]
# Drop authors the detector could not classify.
author_genders = [label for label in author_genders if label != 'unknown']
sns.countplot(author_genders)
plt.rc('figure', figsize=(14, 10))
plt.show()
!pip install git+git://github.com/clintval/gender_predictor.git
from gender_predictor import GenderPredictor
gp = GenderPredictor()
gp.train_and_test()
gp.classify('Aldo')
gender = []
for ca in review['author']:
gender.append(gp.classify(ca))
sns.countplot(gender)
plt.rc('figure', figsize=(12, 7))
plt.show()
# Sample date format check: "July 1, 2019" -> tokens [month, day, year].
amazon['comment_date'][2].split(' ')[0]
amazon['comment_date'][2].split(' ')[2]

# Month column (first token of the date).
def name_cut(x):
    return x.split(' ')[0]

amazon["month"] = amazon["comment_date"].apply(name_cut)
amazon.head()

# Year column (third token of the date).
def name_cut(x):
    return x.split(' ')[2]

amazon["year"] = amazon["comment_date"].apply(name_cut)
amazon.head()
import datetime

# Format demo: parse a timestamp string into a datetime object.
timeStr = '2018-07-28 12:11:32'
Thistime = datetime.datetime.strptime(timeStr, '%Y-%m-%d %H:%M:%S')
print(Thistime)
import matplotlib.patches as mpatches
# Review counts over time, overlaying positive (skyblue) and negative (orange) series.
plt.title('comment_date')
sns.pointplot(x='comment_date', y='comment', data = amazon, color = 'skyblue', label = 'positive')
positive = mpatches.Patch(color='skyblue', label='positive')
plt.legend(handles = [positive])
# NOTE(review): no 'predict' column is ever created on `amazon` in this file —
# presumably a model-prediction column added elsewhere; as the file stands this
# line raises KeyError. Confirm where 'predict' is meant to come from.
sns.pointplot(x='month', y='predict', data = amazon, color = 'orange', label = 'negative')
negative = mpatches.Patch(color='orange', label='negative')
plt.legend(handles = [positive, negative])
plt.ylim(300, 750)